Ten notatnik powstał w oparciu o https://www.kaggle.com/alpertml/credit-card-customers-eda-ml-97-5-accuracy/notebook#Exploring-the-Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
plt.style.use('ggplot')
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import math
import warnings
warnings.filterwarnings('ignore')
# Load the raw churn dataset and immediately drop columns that must not be
# used as features:
# - CLIENTNUM is a customer identifier with no predictive meaning,
# - the two Naive_Bayes_Classifier_* columns are model predictions shipped
#   with the Kaggle file and would leak the target.
# A single drop call replaces the original three separate (in-place) drops.
input_df = pd.read_csv('BankChurners.csv')
display(input_df.shape)
display(input_df.sample(5))
input_df.drop(columns=['CLIENTNUM',
                       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
                       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
              inplace=True)
display(input_df.shape)
(10127, 23)
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5924 | 718469658 | Existing Customer | 37 | F | 2 | Uneducated | Married | Unknown | Blue | 22 | ... | 7927.0 | 996 | 6931.0 | 0.521 | 4094 | 85 | 0.771 | 0.126 | 0.000311 | 0.99969 |
| 3494 | 796791258 | Existing Customer | 50 | M | 1 | Uneducated | Married | $120K + | Blue | 44 | ... | 3104.0 | 1791 | 1313.0 | 1.048 | 4006 | 76 | 0.810 | 0.577 | 0.000315 | 0.99968 |
| 950 | 788814783 | Existing Customer | 39 | M | 1 | Uneducated | Single | $60K - $80K | Blue | 22 | ... | 9204.0 | 845 | 8359.0 | 0.673 | 1820 | 58 | 0.611 | 0.092 | 0.000292 | 0.99971 |
| 5477 | 711379083 | Existing Customer | 60 | F | 0 | Doctorate | Single | Less than $40K | Blue | 42 | ... | 2758.0 | 1429 | 1329.0 | 0.718 | 5124 | 74 | 0.644 | 0.518 | 0.000376 | 0.99962 |
| 5262 | 709486158 | Existing Customer | 33 | F | 3 | Graduate | Married | Less than $40K | Blue | 20 | ... | 1438.3 | 0 | 1438.3 | 0.896 | 4530 | 72 | 0.636 | 0.000 | 0.000096 | 0.99990 |
5 rows × 23 columns
(10127, 20)
# Empty frame that the preprocessing steps below fill column by column.
updated_df = pd.DataFrame()

# Raw categorical columns of the source frame.
cats = ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']

# Numeric columns copied over unchanged at the end of preprocessing
# (order matters: it fixes the column order of the final feature matrix).
numeric_columns = [
    'Customer_Age', 'Credit_Limit', 'Months_on_book', 'Avg_Utilization_Ratio',
    'Avg_Open_To_Buy', 'Total_Trans_Amt', 'Dependent_count',
    'Total_Relationship_Count', 'Months_Inactive_12_mon',
    'Contacts_Count_12_mon', 'Total_Revolving_Bal', 'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1',
]
def tobinary():
    """Binary-encode the target and Gender into the global updated_df.

    Attrition: 1 = existing customer, 0 = attrited (churned) customer.
    Gender:    1 = male, 0 = female.
    """
    attrition_map = {'Existing Customer': 1, 'Attrited Customer': 0}
    gender_map = {'M': 1, 'F': 0}
    updated_df['Attrition'] = input_df['Attrition_Flag'].map(attrition_map)
    updated_df['Gender'] = input_df['Gender'].map(gender_map)
def stringtoint():
    """Ordinal-encode Income_Category and Education_Level into updated_df.

    For each of the two columns this adds a binary missing-value indicator
    (1 where the raw value is 'Unknown') plus an integer encoding in which
    'Unknown' receives the code of the column's mode ('Less than $40K' for
    income, 'Graduate' for education).
    """
    # NOTE(review): the income codes are not monotone in income
    # ($80K - $120K = 3 precedes $60K - $80K = 4) — confirm this is intended.
    income_missing_map = {'Unknown': 1, 'Less than $40K': 0, '$40K - $60K': 0,
                          '$80K - $120K': 0, '$60K - $80K': 0, '$120K +': 0}
    income_code_map = {'Unknown': 1, 'Less than $40K': 1, '$40K - $60K': 2,
                       '$80K - $120K': 3, '$60K - $80K': 4, '$120K +': 5}
    education_missing_map = {'Unknown': 1, 'High School': 0, 'Graduate': 0,
                             'Uneducated': 0, 'College': 0, 'Post-Graduate': 0,
                             'Doctorate': 0}
    education_code_map = {'Unknown': 2, 'High School': 1, 'Graduate': 2,
                          'Uneducated': 3, 'College': 4, 'Post-Graduate': 5,
                          'Doctorate': 6}
    updated_df['Missing_Income'] = input_df['Income_Category'].replace(income_missing_map)
    updated_df['Income_Category'] = input_df['Income_Category'].replace(income_code_map)
    updated_df['Missing_Education'] = input_df['Education_Level'].replace(education_missing_map)
    updated_df['Education_Level'] = input_df['Education_Level'].replace(education_code_map)
def encode():
    """One-hot encode Marital_Status and Card_Category, append to updated_df."""
    global updated_df
    # Marital dummies come first, card dummies second — keep that column order.
    dummy_frames = [
        pd.get_dummies(input_df['Marital_Status'], prefix='Marital'),
        pd.get_dummies(input_df['Card_Category'], prefix='Card'),
    ]
    updated_df = pd.concat([updated_df] + dummy_frames, axis=1)
def concat_with_numerics():
    """Append the untouched numeric columns of input_df to the global updated_df."""
    global updated_df
    numerics = input_df.loc[:, numeric_columns]
    updated_df = pd.concat([updated_df, numerics], axis=1)
# Run the full preprocessing pipeline in order, then compare frame sizes.
for step in (tobinary, stringtoint, encode, concat_with_numerics):
    step()

print('Data shapes """including target value"""')
print(f'Old shape : {input_df.shape}')
print(f'Updated shape : {updated_df.shape}')
Data shapes """including target value""" Old shape : (10127, 20) Updated shape : (10127, 28)
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, cross_validate
def estimates(X_data, y_data, models, cv):
    """Cross-validate each model and collect its mean scores.

    Returns three dicts keyed by the model's class name:
    mean train accuracy, mean test accuracy and mean fit time in seconds.
    """
    train_acc, test_acc, fit_time = {}, {}, {}
    for clf in models:
        name = type(clf).__name__
        scores = cross_validate(clf, X_data, y_data, cv=cv,
                                return_train_score=True, scoring='accuracy')
        train_acc[name] = scores['train_score'].mean()
        test_acc[name] = scores['test_score'].mean()
        fit_time[name] = scores['fit_time'].mean()
    return train_acc, test_acc, fit_time
# Baseline: a shallow (max_depth=2) but long (2250 trees) boosted ensemble.
m_xgb = xgb.XGBClassifier(n_estimators=2250, max_depth=2, random_state=14)
# 11-fold stratified CV keeps the class ratio stable in every fold.
cv = StratifiedKFold(11, shuffle=True, random_state=14)
models = [m_xgb]

# Split the engineered frame into features and target.
X = updated_df.drop('Attrition', axis=1)
y = updated_df['Attrition']
print(X.shape)
print(y.shape)

train_acc_dict, test_acc_dict, time_dict = estimates(X, y, models, cv)
(10127, 27) (10127,) [14:15:19] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:15:25] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:15:40] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:15:46] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:15:52] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:15:57] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:04] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:10] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:16] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:33] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:39] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
# Mean cross-validated test accuracy per model.
for name, acc in test_acc_dict.items():
    print('{} - {:.1f}%'.format(name, acc * 100))
XGBClassifier - 97.1%
Sprawdźmy, jak radzi sobie, gdy usuniemy skorelowane cechy i dobierzemy inne hiperparametry.
# Drop features correlated with the remaining ones and try a deeper
# (max_depth=5) but much smaller (100 trees) XGBoost model.
X_small = X.drop(['Customer_Age', 'Total_Trans_Ct', 'Credit_Limit'], axis='columns')

deeper_xgb = xgb.XGBClassifier(n_estimators=100, max_depth=5, random_state=14)
cv = StratifiedKFold(11, shuffle=True, random_state=14)
xgb_clfs = [deeper_xgb]

train_acc_dict, test_acc_dict, time_dict = estimates(X_small, y, xgb_clfs, cv)
test_acc_dict
[14:16:46] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:46] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:47] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:48] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:48] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:49] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:50] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:50] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. 
Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:51] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:51] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [14:16:52] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'XGBClassifier': 0.9619847390488943}
from sklearn.model_selection import train_test_split
# Hold out a test split (default 25%) and fit the deeper model on the rest;
# the fitted model is explained with dalex below.
X_train, X_test, y_train, y_test = train_test_split(X_small, y, random_state=14)
deeper_xgb.fit(X_train, y_train)
[14:16:53] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=5,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=14,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
!pip install dalex -U
Requirement already up-to-date: dalex in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (1.0.1)
Requirement already satisfied, skipping upgrade: plotly>=4.12.0 in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (from dalex) (4.14.3)
Requirement already satisfied, skipping upgrade: pandas>=1.1.2 in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (from dalex) (1.2.2)
Requirement already satisfied, skipping upgrade: setuptools in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (from dalex) (47.1.0)
Requirement already satisfied, skipping upgrade: tqdm>=4.48.2 in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (from dalex) (4.59.0)
Requirement already satisfied, skipping upgrade: numpy>=1.18.4 in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (from dalex) (1.20.1)
Requirement already satisfied, skipping upgrade: six in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (from plotly>=4.12.0->dalex) (1.15.0)
Requirement already satisfied, skipping upgrade: retrying>=1.3.3 in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (from plotly>=4.12.0->dalex) (1.3.3)
Requirement already satisfied, skipping upgrade: pytz>=2017.3 in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (from pandas>=1.1.2->dalex) (2021.1)
Requirement already satisfied, skipping upgrade: python-dateutil>=2.7.3 in /home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/lib/python3.8/site-packages (from pandas>=1.1.2->dalex) (2.8.1)
WARNING: You are using pip version 20.1.1; however, version 21.0.1 is available.
You should consider upgrading via the '/home/sawcio/Studia/4sem/Warsztaty_badawcze/wb-env/bin/python -m pip install --upgrade pip' command.
import dalex as dx
# Wrap the fitted model with its training data; dalex infers the predict
# function and the (classification) task type automatically.
explainer = dx.Explainer(deeper_xgb, X_train, y_train)
Preparation of a new explainer is initiated -> data : 7595 rows 24 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 7595 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x7f69b9f41af0> will be used (default) -> predict function : Accepts pandas.DataFrame and numpy.ndarray. -> predicted values : min = 9.37e-06, mean = 0.841, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.779, mean = -5.26e-05, max = 0.7 -> model_info : package xgboost A new explainer has been created!
# Compare the model's score for one held-out client with the true label,
# then compute its break-down explanation (the default predict_parts type).
obs_idx = 4
display(explainer.predict(X_test)[obs_idx], y_test.iloc[obs_idx])
pp = explainer.predict_parts(X_test.iloc[obs_idx, :])
pp
| variable_name | variable_value | variable | cumulative | contribution | sign | position | label | |
|---|---|---|---|---|---|---|---|---|
| 0 | intercept | 1 | intercept | 0.840606 | 8.406057e-01 | 1.0 | 23 | XGBClassifier |
| 1 | Total_Trans_Amt | 14240.0 | Total_Trans_Amt = 14240.0 | 0.986708 | 1.461025e-01 | 1.0 | 22 | XGBClassifier |
| 2 | Total_Relationship_Count | 2.0 | Total_Relationship_Count = 2.0 | 0.987994 | 1.286149e-03 | 1.0 | 21 | XGBClassifier |
| 3 | Total_Revolving_Bal | 1465.0 | Total_Revolving_Bal = 1465.0 | 0.990510 | 2.515256e-03 | 1.0 | 20 | XGBClassifier |
| 4 | Months_Inactive_12_mon | 1.0 | Months_Inactive_12_mon = 1.0 | 0.996665 | 6.155789e-03 | 1.0 | 19 | XGBClassifier |
| 5 | Total_Amt_Chng_Q4_Q1 | 0.63 | Total_Amt_Chng_Q4_Q1 = 0.63 | 0.999335 | 2.669394e-03 | 1.0 | 18 | XGBClassifier |
| 6 | Total_Ct_Chng_Q4_Q1 | 0.656 | Total_Ct_Chng_Q4_Q1 = 0.656 | 0.999560 | 2.249479e-04 | 1.0 | 17 | XGBClassifier |
| 7 | Avg_Utilization_Ratio | 0.407 | Avg_Utilization_Ratio = 0.407 | 0.999497 | -6.252527e-05 | -1.0 | 16 | XGBClassifier |
| 8 | Contacts_Count_12_mon | 2.0 | Contacts_Count_12_mon = 2.0 | 0.999944 | 4.463196e-04 | 1.0 | 15 | XGBClassifier |
| 9 | Months_on_book:Marital_Married | 48.0:1.0 | Months_on_book:Marital_Married = 48.0:1.0 | 0.999951 | 7.748604e-06 | 1.0 | 14 | XGBClassifier |
| 10 | Dependent_count | 3.0 | Dependent_count = 3.0 | 0.999960 | 8.761883e-06 | 1.0 | 13 | XGBClassifier |
| 11 | Gender | 0.0 | Gender = 0.0 | 0.999960 | 1.192093e-07 | 1.0 | 12 | XGBClassifier |
| 12 | Missing_Education | 1.0 | Missing_Education = 1.0 | 0.999954 | -6.496906e-06 | -1.0 | 11 | XGBClassifier |
| 13 | Avg_Open_To_Buy:Marital_Single | 2138.0:0.0 | Avg_Open_To_Buy:Marital_Single = 2138.0:0.0 | 0.999960 | 6.556511e-06 | 1.0 | 10 | XGBClassifier |
| 14 | Education_Level | 2.0 | Education_Level = 2.0 | 0.999956 | -4.291534e-06 | -1.0 | 9 | XGBClassifier |
| 15 | Income_Category | 1.0 | Income_Category = 1.0 | 0.999959 | 2.682209e-06 | 1.0 | 8 | XGBClassifier |
| 16 | Marital_Divorced | 0.0 | Marital_Divorced = 0.0 | 0.999957 | -1.609325e-06 | -1.0 | 7 | XGBClassifier |
| 17 | Card_Blue | 1.0 | Card_Blue = 1.0 | 0.999957 | 0.000000e+00 | 0.0 | 6 | XGBClassifier |
| 18 | Card_Gold | 0.0 | Card_Gold = 0.0 | 0.999957 | 0.000000e+00 | 0.0 | 5 | XGBClassifier |
| 19 | Card_Silver | 0.0 | Card_Silver = 0.0 | 0.999957 | 0.000000e+00 | 0.0 | 4 | XGBClassifier |
| 20 | Missing_Income | 0.0 | Missing_Income = 0.0 | 0.999957 | 0.000000e+00 | 0.0 | 3 | XGBClassifier |
| 21 | Marital_Unknown | 0.0 | Marital_Unknown = 0.0 | 0.999957 | 0.000000e+00 | 0.0 | 2 | XGBClassifier |
| 22 | Card_Platinum | 0.0 | Card_Platinum = 0.0 | 0.999957 | 0.000000e+00 | 0.0 | 1 | XGBClassifier |
| 23 | prediction | 0.999957 | 9.999571e-01 | 1.0 | 0 | XGBClassifier |
# Waterfall plot of the break-down contributions computed above.
pp.plot()
# SHAP attributions — note this explains row 4 of X_train, not the X_test
# row explained earlier.
pp_shap = explainer.predict_parts(X_train.iloc[4,:], type='shap')
pp_shap
| variable | contribution | variable_name | variable_value | sign | label | B | |
|---|---|---|---|---|---|---|---|
| 0 | Card_Silver = 0.0 | -0.000042 | Card_Silver | 0.0 | -1.0 | XGBClassifier | 1 |
| 1 | Missing_Education = 0.0 | 0.000313 | Missing_Education | 0.0 | 1.0 | XGBClassifier | 1 |
| 2 | Contacts_Count_12_mon = 2.0 | 0.007205 | Contacts_Count_12_mon | 2.0 | 1.0 | XGBClassifier | 1 |
| 3 | Income_Category = 1.0 | 0.000376 | Income_Category | 1.0 | 1.0 | XGBClassifier | 1 |
| 4 | Gender = 0.0 | -0.003118 | Gender | 0.0 | -1.0 | XGBClassifier | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 19 | Marital_Single = 1.0 | -0.000071 | Marital_Single | 1.0 | -1.0 | XGBClassifier | 0 |
| 20 | Card_Blue = 1.0 | 0.000060 | Card_Blue | 1.0 | 1.0 | XGBClassifier | 0 |
| 21 | Missing_Income = 0.0 | -0.000015 | Missing_Income | 0.0 | -1.0 | XGBClassifier | 0 |
| 22 | Card_Silver = 0.0 | -0.000015 | Card_Silver | 0.0 | -1.0 | XGBClassifier | 0 |
| 23 | Card_Platinum = 0.0 | 0.000000 | Card_Platinum | 0.0 | 0.0 | XGBClassifier | 0 |
624 rows × 7 columns
pp_shap.plot(max_vars=16)
Widzimy, że największy wpływ na predykcję miała zmienna Total_Trans_Amt. Spójrzmy na jej gęstość z podziałem na zmienną celu: Widać, że ta zmienna dosyć dobrze różnicuje klientów, którzy odeszli od tych którzy zostali.
Wyniki Break Down i Shapley Values różnią się od siebie, w szczególności:
- dla Shapley Values istotny wkład ma m.in. Total_Ct_Chng_Q4_Q1, a dla Break Down wkład tej zmiennej wynosi ok. 0,
- według Break Down zmienna Total_Trans_Amt przyczyniła się o 0.146 i zdecydowanie ma największy wpływ na wynik, a Shapley Values rozdzielają wkład na więcej zmiennych.
for index,observation in enumerate(X_test.itertuples()):
# Body of the loop over X_test rows (the `for ... in enumerate(X_test.itertuples())`
# header is fused onto the preceding prose line by the notebook export):
# explain each observation and show its two strongest break-down variables.
# `observation` is a namedtuple; drop([0]) removes the Index field.
pp = explainer.predict_parts(pd.DataFrame(observation).drop([0]).transpose(), type = 'break_down')
# Result rows 1 and 2 are the two largest contributions after the intercept.
_vars = [pp.result.loc[1,:].variable_name, pp.result.loc[2,:].variable_name]
display(index)
display(_vars)
# Stop after the first 11 observations.
if index == 10:
break
0
['Total_Amt_Chng_Q4_Q1', 'Months_Inactive_12_mon']
1
['Total_Trans_Amt', 'Total_Ct_Chng_Q4_Q1']
2
['Total_Revolving_Bal', 'Total_Amt_Chng_Q4_Q1']
3
['Total_Revolving_Bal', 'Total_Ct_Chng_Q4_Q1']
4
['Total_Trans_Amt', 'Total_Revolving_Bal']
5
['Total_Trans_Amt', 'Months_Inactive_12_mon']
6
['Total_Amt_Chng_Q4_Q1', 'Contacts_Count_12_mon']
7
['Total_Trans_Amt', 'Total_Revolving_Bal']
8
['Total_Amt_Chng_Q4_Q1', 'Avg_Open_To_Buy']
9
['Total_Amt_Chng_Q4_Q1', 'Months_on_book']
10
['Total_Amt_Chng_Q4_Q1', 'Contacts_Count_12_mon']
# Break-down explanation for test observation 5; its true label echoes below.
pp = explainer.predict_parts(X_test.iloc[5, :], type='break_down')
pp.plot()
y_test.iloc[5]
1
# Same break-down analysis for test observation 6 (a churned client).
pp = explainer.predict_parts(X_test.iloc[6, :], type='break_down')
pp.plot()
y_test.iloc[6]
0
| index | 5 | 6 |
|---|---|---|
| 1. najważniejsza zmienna | Total_Trans_Amt | Total_Amt_Chng_Q4_Q1 |
| 2. najważniejsza zmienna | Months_Inactive_12_mon | Contacts_Count_12_mon |
| predykcja i etykieta | klient pozostał | klient odszedł |
Te obserwacje nie tylko różnią się najważniejszymi zmiennymi, ale też predykcją i etykietą. Może to ma wpływ na wybór najważniejszych cech przez metodę Break Down. Spróbujmy wyjaśnić kilka obserwacji, które mają etykietę 0.
# Sample five churned (label 0) test clients and explain the first three,
# showing all 24 variables in each break-down plot.
attrited = X_test[y_test == 0].sample(5, random_state=997)
for i in range(3):
    explainer.predict_parts(attrited.iloc[i, :], type='break_down').plot(max_vars=24)
Wśród próbki z obserwacji z etykietą 0 różne cechy występują jako najważniejsze. Kolejność cech nie zależy więc od etykiety i predykcji.
# Explain test observations 55 and 122 — both share Contacts_Count_12_mon == 3
# yet the feature contributes with opposite signs (discussed below).
for idx in (55, 122):
    pp = explainer.predict_parts(X_test.iloc[idx, :], type='break_down')
    pp.plot(max_vars=24)
Dla 55 i 122 obserwacji cecha Contacts_Count_12_mon jest równa 3, jednak w pierwszym przypadku zwiększa predykcję o 0.003, a w drugim zmniejsza o 0.006.
Cechy takie jak rodzaj karty czy stan cywilny wpływają nieznacznie lub zupełnie nie wpływają na predykcję modelu. Można by spróbować usunąć je i sprawdzić, jaką skuteczność osiąga model.